Train 80% -> actual training 75%, weight training 5% (weights taken from t-1)

test 20%

Code
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)
Code
# Load the order-book snapshots from a CSV path relative to the working
# directory (presumably the book data of stock 100 -- confirm against the data).
stock <- read.csv("data/individual_book_train/stock_100.csv")
Code
# Derive three first-level order-book features for every snapshot row.
stock <- stock %>%
  mutate(
    # Weighted average price: each side's price weighted by the opposite
    # side's size.
    WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) /
      (bid_size1 + ask_size1),
    # Relative spread between best ask and best bid.
    BidAskSpread = ask_price1 / bid_price1 - 1,
    # Absolute size imbalance at the top of the book, in [0, 1].
    imbalance = abs((bid_size1 - ask_size1) / (bid_size1 + ask_size1))
  )


# Per-time_id log returns on a complete 1..600 second grid, plus the mean
# imbalance and mean bid-ask spread of each time_id.
#
# Only the first five time_ids are processed; use unique(stock$time_id) to
# process all of them. head() never pads with NA when fewer than five exist,
# and the column is addressed by name because the filter below relies on
# `time_id` anyway.
time_IDs <- head(unique(stock$time_id), 5)

log_rs <- vector("list", length(time_IDs))
imba_mean <- numeric(length(time_IDs))
BAS_mean <- numeric(length(time_IDs))

for (i in seq_along(time_IDs)) {
  # Filter once and reuse the subset for every derived quantity.
  snap <- stock %>% filter(time_id == time_IDs[i])
  price <- snap$WAP

  imba_mean[i] <- mean(snap$imbalance)
  BAS_mean[i] <- mean(snap$BidAskSpread)

  # Log return between consecutive snapshots. Negative indexing stays correct
  # for a single-row bucket (yields an empty vector), unlike 1:(n - 1).
  log_r <- log(price[-1] / price[-length(price)])
  returns <- data.frame(time = snap$seconds_in_bucket[-1], log_return = log_r)

  # Seconds without a book update carry a zero return so that every time_id
  # ends up with exactly one row per second in 1..600.
  time.no.change <- setdiff(1:600, returns$time)
  if (length(time.no.change) > 0) {
    returns <- rbind(returns, data.frame(time = time.no.change, log_return = 0))
    returns <- returns[order(returns$time), ]
  }
  log_rs[[i]] <- returns
}

# Container for the per-time_id volatility tables.
vol <- list()

# Realised volatility of a vector of log returns: the square root of the sum
# of squared returns. An empty vector gives 0.
comp_vol <- function(x) {
  squared <- x ^ 2
  sqrt(sum(squared))
}
# Assign every second to a 30-second bucket (1..20 for times 1..600) and
# compute the realised volatility of each bucket. The time_bucket column is
# stored back into log_rs so later aggregations can reuse it.
# seq_along() makes the loop a no-op on empty input instead of iterating 1, 0.
for (i in seq_along(log_rs)) {
  log_rs[[i]] <- log_rs[[i]] %>% mutate(time_bucket = ceiling(time / 30))
  bucket_vol <- aggregate(log_return ~ time_bucket, data = log_rs[[i]],
                          FUN = comp_vol)
  colnames(bucket_vol) <- c("time_bucket", "volatility")
  vol[[i]] <- bucket_vol
}

Cluster

Code
# Assign each time_id to one of four clusters from its mean bid-ask spread and
# mean order-book imbalance:
#   4: mean spread    > 0.15
#   3: mean imbalance > 0.61
#   2: mean imbalance < 0.45
#   1: everything else
cluster_l <- numeric(length(vol))
for (i in seq_along(vol)) {
  if (BAS_mean[[i]] > 0.15) {
    # Fix: assign element i. The previous `cluster_l <- 4` replaced the whole
    # vector with a single value and discarded every earlier assignment.
    cluster_l[[i]] <- 4
  } else if (imba_mean[[i]] > 0.61) {
    cluster_l[[i]] <- 3
  } else if (imba_mean[[i]] < 0.45) {
    cluster_l[[i]] <- 2
  } else {
    cluster_l[[i]] <- 1
  }
}

model

egarch

Code
# EGARCH(1,1) specification with an ARMA(0,0) mean model and normal
# innovations.
spec <- ugarchspec(
  variance.model = list(model = "eGARCH", garchOrder = c(1, 1)),
  mean.model = list(armaOrder = c(0, 0)),
  distribution.model = "norm"
)

# One fitted model per time_id, trained on the first 450 of 600 seconds
# (the first 75% of the bucket).
ARMA_GARCH.models <- vector("list", length(vol))
for (i in seq_along(vol)) {
  train_returns <- log_rs[[i]] %>%
    filter(time <= 450) %>%
    pull(log_return)
  ARMA_GARCH.models[[i]] <- ugarchfit(spec = spec, data = train_returns,
                                      solver = "hybrid")
}

# 30weight 120 real predict
# Simulation horizon: the first n_w seconds are used to tune the blend weight,
# the following n_p seconds are the actual prediction window.
n_w <- 30
n_p <- 120
# The horizon is summarised in blocks of n_w seconds, so n_p must be a
# multiple of n_w (30 and 120 give one weight block and four forecast blocks).
stopifnot(n_p %% n_w == 0)
n_block <- (n_w + n_p) / n_w

garch_weight <- numeric(length(vol))
pred1 <- vector("list", length(vol))
pred1_adjust <- vector("list", length(vol))

# NOTE(review): no RNG seed is set in this chunk, so the simulated paths (and
# everything derived from them) change between runs -- confirm that is intended.
for (i in seq_along(vol)) {
  # Placeholder kept for compatibility; it is not filled in this chunk.
  # tibble() replaces the deprecated data_frame().
  pred1[[i]] <- tibble(fitted = rep(1, n_p))

  # Freeze the fitted coefficients and simulate 1000 future paths.
  fspec <- getspec(ARMA_GARCH.models[[i]])
  setfixed(fspec) <- as.list(coef(ARMA_GARCH.models[[i]]))
  future.path <- fitted(ugarchpath(fspec, n.sim = n_w + n_p, m.sim = 1000))
  future.path[is.na(future.path)] <- 0

  # Realised volatility of every simulated path within each n_w-second block,
  # averaged across the simulated paths.
  block_of_row <- ceiling(seq_len(n_w + n_p) / n_w)
  block_vol <- vapply(seq_len(n_block), function(b) {
    mean(sqrt(colSums(future.path[block_of_row == b, , drop = FALSE] ^ 2)))
  }, numeric(1))

  # Block 1 feeds the weight search; the remaining blocks are the forecasts.
  garch_weight[i] <- block_vol[1]
  pred1_adjust[[i]] <- tibble(fitted = block_vol[-1])
}

wlr

Code
# Split each volatility table into the first 15 buckets (training) and the
# remaining buckets (validation).
vol.w <- list()  # declared but not filled in this chunk
vol.train <- vector("list", length(log_rs))
vol.val <- vector("list", length(log_rs))

for (i in seq_along(log_rs)) {
  vol.train[[i]] <- vol[[i]][1:15, ]
  vol.val[[i]] <- vol[[i]][-(1:15), ]
}

# Training frames for the weighted linear regression: the volatility of bucket
# k + 1 is regressed on the mean price, mean order size and mean spread of
# bucket k.
stocklm <- stock %>%
  mutate(
    time_bucket = ceiling(seconds_in_bucket / 30),
    num_order = bid_size1 + ask_size1 + bid_size2 + ask_size2
  )
len.train <- length(vol.train[[1]]$volatility)
lag_idx <- seq_len(len.train - 1)

list.reg <- vector("list", length(vol))

for (i in seq_along(vol)) {
  # Bucket 0 only holds the snapshot at second 0 and is excluded.
  stats.bucket <- stocklm %>%
    filter(time_id == time_IDs[i], time_bucket != 0) %>%
    select(BidAskSpread, WAP, num_order, time_bucket)
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket,
                          FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket,
                        FUN = mean)
  # NOTE(review): the aggregates are indexed by position, which assumes every
  # bucket has at least one snapshot. A bucket without updates would shift the
  # predictors against the volatility series -- confirm against the data.
  list.reg[[i]] <- data.frame(
    volatility = vol.train[[i]]$volatility[-1],
    price = mean.price$WAP[lag_idx],
    order = mean.order$num_order[lag_idx],
    BidAskSpread = mean.BAS$BidAskSpread[lag_idx]
  )
}

# Weighted linear regression per time_id. The weights decay geometrically with
# age: the most recent training row gets weight 1 and each older row is
# multiplied by a further 0.8^(1/2).
lm.models <- vector("list", length(vol))

for (i in seq_along(vol)) {
  lm.models[[i]] <- lm(volatility ~ price + order + BidAskSpread, list.reg[[i]],
                       weights = 0.8 ^ (((len.train - 2):0) / 2))
}
Code
# Validation frames and WLR predictions. The predictors come from buckets
# len.train .. len.train + len.val - 1, i.e. one bucket before each validation
# volatility.
len.val <- length(vol.val[[1]]$volatility)
val_idx <- len.train:(len.train + len.val - 1)

list.reg.val <- vector("list", length(vol))
pred.lm <- vector("list", length(vol))

for (i in seq_along(vol)) {
  # Bucket 0 only holds the snapshot at second 0 and is excluded.
  stats.bucket <- stocklm %>%
    filter(time_id == time_IDs[i], time_bucket != 0) %>%
    select(BidAskSpread, WAP, num_order, time_bucket)
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket,
                          FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket,
                        FUN = mean)
  list.reg.val[[i]] <- data.frame(
    volatility = vol.val[[i]]$volatility,
    price = mean.price$WAP[val_idx],
    order = mean.order$num_order[val_idx],
    BidAskSpread = mean.BAS$BidAskSpread[val_idx]
  )
  pred.lm[[i]] <- predict(lm.models[[i]], newdata = list.reg.val[[i]])
}

# WLR predictions under the name used by the mixing step.
pred2 <- pred.lm

hav

Code
# Training frames for the HAV model: the volatility of bucket k is explained by
# the volatility of bucket k - 1 (vol_1) and the mean volatility of buckets
# k - 5 .. k - 1 (mean_vol_5).
list.HAV <- vector("list", length(vol))

for (i in seq_along(vol)) {
  train_vol <- vol.train[[i]]$volatility
  # Rolling 5-bucket mean, accumulated one lag at a time.
  mean.vol <- rep(0, len.train - 5)
  for (j in 1:5) {
    mean.vol <- mean.vol + train_vol[j:(j + len.train - 6)] / 5
  }
  list.HAV[[i]] <- data.frame(
    vol = train_vol[-(1:5)],
    vol_1 = train_vol[5:(len.train - 1)],
    mean_vol_5 = mean.vol
  )
}


# Container for the per-time_id quarticity tables.
quar <- list()

# Quarticity of a vector of log returns: the number of observations divided by
# three, times the sum of the fourth powers. An empty vector gives 0.
comp_quar <- function(x) {
  n_obs <- length(x)
  n_obs / 3 * sum(x ^ 4)
}
# Quarticity of the log returns in every 30-second bucket, per time_id.
# seq_along() makes the loop a no-op on empty input instead of iterating 1, 0.
for (i in seq_along(log_rs)) {
  bucket_quar <- aggregate(log_return ~ time_bucket, data = log_rs[[i]],
                           FUN = comp_quar)
  colnames(bucket_quar) <- c("time_bucket", "quarticity")
  quar[[i]] <- bucket_quar
}

# HAV model fitted by weighted least squares, one model per time_id. Each
# training row is weighted by its lagged volatility divided by the square root
# of the quarticity of the same lagged bucket.
# NOTE(review): a lagged bucket without any price change has zero volatility
# and zero quarticity, which makes its weight 0 / 0 = NaN -- confirm that such
# buckets cannot occur in the training window.
HAV.wls.models <- vector("list", length(vol))

for (i in seq_along(vol)) {
  HAV.wls.models[[i]] <- lm(vol ~ vol_1 + mean_vol_5, list.HAV[[i]],
                            weights = list.HAV[[i]]$vol_1 /
                              sqrt(quar[[i]]$quarticity[5:(len.train - 1)]))
}
Code
# Recursive HAV forecast of the five validation buckets (16..20) per time_id.
# Starting from the last five training volatilities, each step predicts the
# next bucket and then slides the window forward by replacing the oldest value
# with that prediction. After five steps the window holds exactly the five
# forecasts.
hav_steps <- 5
pred3 <- vector("list", length(vol))

for (i in seq_along(vol)) {
  # Last five observed training buckets (11..15 for a 15-bucket training set).
  recent <- tail(vol.train[[i]]$volatility, 5)
  for (step in seq_len(hav_steps)) {
    hav_input <- data.frame(vol_1 = recent[5], mean_vol_5 = sum(recent) / 5)
    next_vol <- unname(predict(HAV.wls.models[[i]], newdata = hav_input))
    recent <- c(recent[-1], next_vol)
  }
  pred3[[i]] <- recent
}

# Kept in the nested layout the original chunk produced:
# pred.hav.all[[1]][[i]] is the forecast vector of time_id i.
pred.hav.all <- list(pred3)

mix

cluster 1,3 = EGARCH + WLR

cluster 2,4 = HAV + WLR

Code
# Blend weight for model a, chosen from `grid`, that minimises the absolute
# error of grid * pred_a + (1 - grid) * pred_b against `actual`. Ties go to the
# smallest weight. The grid is built as (0:10) / 10 so the candidates do not
# pick up the rounding drift of repeatedly adding 0.1.
best_blend_weight <- function(pred_a, pred_b, actual, grid = (0:10) / 10) {
  err <- abs(grid * pred_a + (1 - grid) * pred_b - actual)
  if (all(is.na(err))) {
    stop("cannot choose a blend weight: every candidate error is NA",
         call. = FALSE)
  }
  grid[which.min(err)]
}

# Mix two models per time_id for buckets 17..20:
#   cluster 2 or 4: model a = HAV,    model b = WLR
#   cluster 1 or 3: model a = EGARCH, model b = WLR
# The weight applied to bucket t is the one that would have been best on
# bucket t - 1 (forecasts of t - 1 against the realised volatility of t - 1).
mix <- vector("list", length(vol))

for (i in seq_along(vol)) {
  actual <- vol.val[[i]]$volatility   # realised volatility, buckets 16..20
  wlr <- unname(pred2[[i]])           # WLR forecasts, buckets 16..20

  if (cluster_l[[i]] %in% c(2, 4)) {
    # Fix: the bucket-16 forecast of model a must come from the HAV model.
    # It was previously taken from garch_weight, i.e. the EGARCH forecast,
    # even though the mixed forecasts of this branch are HAV.
    model_a <- pred3[[i]]
  } else {
    model_a <- c(garch_weight[[i]], pred1_adjust[[i]]$fitted)
  }

  # Row k (bucket 16 + k) uses the weight tuned on bucket 15 + k.
  alpha_w <- vapply(1:4, function(k) {
    best_blend_weight(model_a[k], wlr[k], actual[k])
  }, numeric(1))
  beta_w <- round(1 - alpha_w, digits = 1)

  mod_a <- model_a[2:5]
  mod_b <- wlr[2:5]

  mix[[i]] <- data.frame(
    time = c(17, 18, 19, 20),
    pred_f = mod_a * alpha_w + mod_b * beta_w,
    mod_a = mod_a,
    mod_b = mod_b,
    alpha_w = alpha_w,
    beta_w = beta_w,
    val = actual[2:5]
  )
}

result

value table

Code
mix
[[1]]
  time       pred_f       mod_a        mod_b alpha_w beta_w          val
1   17 0.0010990287 0.001099029 0.0012196350       1      0 0.0015946080
2   18 0.0009863879 0.001091498 0.0009863879       0      1 0.0007010344
3   19 0.0007984031 0.001091033 0.0007984031       0      1 0.0007203055
4   20 0.0011301855 0.001095010 0.0011301855       0      1 0.0008264389

[[2]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 3.861298e-05 0.0006609389 3.861298e-05     0.0    1.0 0.0006225340
2   18 6.215609e-04 0.0006624486 2.535723e-04     0.9    0.1 0.0003323998
3   19 3.356456e-04 0.0006626192 2.539023e-04     0.2    0.8 0.0001328152
4   20 1.260451e-04 0.0006713540 1.260451e-04     0.0    1.0 0.0001088957

[[3]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0011106308 0.0006334167 0.0011106308     0.0    1.0 0.0011320570
2   18 0.0009069263 0.0006068875 0.0009069263     0.0    1.0 0.0010788897
3   19 0.0010573743 0.0006246518 0.0010573743     0.0    1.0 0.0007937135
4   20 0.0007921832 0.0006227508 0.0010463319     0.6    0.4 0.0007345000

[[4]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0006207086 0.0002209370 0.0006207086     0.0    1.0 0.0004778847
2   18 0.0004359080 0.0002191252 0.0005804300     0.4    0.6 0.0005119734
3   19 0.0004927179 0.0002178328 0.0005614391     0.2    0.8 0.0007271090
4   20 0.0006149317 0.0002167933 0.0006149317     0.0    1.0 0.0001894792

[[5]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0004222089 0.0006729745 0.0004222089       0      1 0.0001584394
2   18 0.0003642260 0.0006676014 0.0003642260       0      1 0.0007691548
3   19 0.0006641943 0.0006641943 0.0004001583       1      0 0.0006672344
4   20 0.0006670953 0.0006670953 0.0003852155       1      0 0.0008105287

plot

Code
# One line chart per time_id comparing the realised volatility with both
# component models and the mixed forecast. The subtitle lists the cluster, the
# model pair and the (alpha : beta) weights used for each bucket.
model_colours <- c(
  "Real Volatility" = "red",
  "Model a" = "lightblue",
  "Model b" = "green",
  "Mix Model" = "blue"
)

all_plot <- vector("list", length(vol))

for (i in seq_along(vol)) {
  # "<bucket> = ( <alpha> : <beta> )" for every row, joined by single spaces;
  # the leading "" keeps the leading space of the original label.
  weight_parts <- paste(mix[[i]]$time, "=", "(",
                        mix[[i]]$alpha_w, ":", mix[[i]]$beta_w, ")")
  weight_p <- paste(c("", weight_parts), collapse = " ")

  model_pair <- if (cluster_l[[i]] %in% c(2, 4)) {
    "HAV + WLR"
  } else {
    "EGARCH + WLR"
  }

  all_plot[[i]] <- ggplot(mix[[i]], aes(x = time)) +
    geom_line(aes(y = val, color = "Real Volatility")) +
    geom_line(aes(y = mod_a, color = "Model a"), linetype = "twodash") +
    geom_line(aes(y = mod_b, color = "Model b"), linetype = "twodash") +
    geom_line(aes(y = pred_f, color = "Mix Model"), linetype = "twodash") +
    scale_color_manual(name = "Model", values = model_colours) +
    theme_classic() +
    labs(
      title = "Prediction Result",
      tag = as.character(i),
      subtitle = paste("cluster ", as.character(cluster_l[[i]]),
                       model_pair,
                       "\n weight for each time interval : \n",
                       weight_p),
      x = "Time interval",
      y = "Volatility",
      caption = "each time interval = 30 seconds"
    )
}
Code
all_plot
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]

animation plot

Code
# Animate the chart of the first time_id, revealing it along the time axis.
all_plot[[1]]+transition_reveal(time)

Code
# Animate the chart of the second time_id, revealing it along the time axis.
all_plot[[2]]+transition_reveal(time)

Code
# Animate the chart of the third time_id, revealing it along the time axis.
all_plot[[3]]+transition_reveal(time)

Code
# Animate the chart of the fourth time_id, revealing it along the time axis.
all_plot[[4]]+transition_reveal(time)

Code
# Animate the chart of the fifth time_id, revealing it along the time axis.
all_plot[[5]]+transition_reveal(time)

5 time IDs

Code
# Build one long frame that concatenates all time_ids. Each time_id contributes
# 16 history rows (realised volatility only, all model columns NaN) followed by
# its rows of `mix`.
n_hist <- 16

test_df <- lapply(seq_along(vol), function(i) {
  data.frame(
    time = seq_len(n_hist),
    pred_f = NaN,
    mod_a = NaN,
    mod_b = NaN,
    alpha_w = NaN,
    beta_w = NaN,
    # 15 training buckets plus the first validation bucket (bucket 16).
    val = c(vol.train[[i]]$volatility[seq_len(n_hist - 1)],
            vol.val[[i]]$volatility[1])
  )
})

# Interleave history and forecast rows per time_id and bind them once. This
# also works for a single time_id, where the previous 2:length(mix) loop
# counted down (2, 1) and failed.
agg_mix <- do.call(rbind, lapply(seq_along(mix), function(i) {
  rbind(test_df[[i]], mix[[i]])
}))
agg_mix$time <- seq_len(nrow(agg_mix))

# c1..c4 repeat the realised volatility only on the rows of time_ids that
# belong to the respective cluster (NaN elsewhere), so each cluster can be
# drawn in its own colour.
rows_per_id <- n_hist + nrow(mix[[1]])
row_cluster <- rep(unlist(cluster_l)[seq_along(mix)], each = rows_per_id)
for (k in 1:4) {
  agg_mix[[paste0("c", k)]] <- ifelse(row_cluster == k, agg_mix$val, NaN)
}
Code
# Legend entry -> line colour for the aggregate chart.
agg_colours <- c(
  "Volatility" = "lightgrey",
  "Cluster 1" = "darkgreen",
  "Cluster 2" = "gold",
  "Cluster 3" = "red",
  "Cluster 4" = "blue",
  "Prediction" = "black"
)

# Aggregate chart over all time_ids: realised volatility in grey, overdrawn in
# the colour of the cluster each time_id belongs to, with the mixed forecast
# as a dashed black line.
agg_plot <- ggplot(agg_mix, aes(x = time)) +
  geom_line(aes(y = val, color = "Volatility")) +
  geom_line(aes(y = c1, color = "Cluster 1")) +
  geom_line(aes(y = c2, color = "Cluster 2")) +
  geom_line(aes(y = c3, color = "Cluster 3")) +
  geom_line(aes(y = c4, color = "Cluster 4")) +
  geom_line(aes(y = pred_f, color = "Prediction"), linetype = "twodash") +
  scale_color_manual(name = "Cluster/Prediction", values = agg_colours) +
  theme_classic() +
  labs(
    title = "Cluster/Prediction Result",
    x = "Time interval",
    y = "Volatility",
    caption = "each time interval = 30 seconds"
  )

# Render the chart as an animation revealed along the time axis.
agg_plot + transition_reveal(time)